notebook.community

Edit and run



In [19]:

    
import os

import pandas as pd
import numpy as np

import seaborn as sns; sns.set()

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix

from statsmodels.discrete.discrete_model import Logit, LogitResults



In [5]:

    
pd.set_option('max_columns', 200)
pd.set_option('max_rows', 200)
pd.set_option('display.float_format', '{:.2f}'.format)



In [7]:

    
df = pd.read_pickle('../Chapter 7 - Data Preparation and Visualization/claims_df')



In [32]:

    
df.shape









    Out[32]:





(4700, 38)



In [9]:

    
disease = ['SP_ALZHDMTA','SP_CHF', 'SP_CHRNKIDN', 'SP_CNCR', 'SP_COPD', 'SP_DEPRESSN','SP_DIABETES', 'SP_ISCHMCHT', 'SP_OSTEOPRS', 'SP_RA_OA', 'SP_STRKETIA']
gender = ['gender_2']
ESRD = ['ESRD_Y']



In [10]:

    
X = df[disease+gender+ESRD]
y = df.HIGH_COST



In [11]:

    
lab_enc = LabelEncoder()



In [12]:

    
lab_enc.fit_transform(y)









    Out[12]:





array([0, 0, 0, ..., 0, 0, 0])



In [110]:

    
lm = LogisticRegression()



In [111]:

    
lm.fit(X, y)









    Out[111]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [112]:

    
lm.score(X, y)









    Out[112]:





0.9027659574468085



In [113]:

    
def generate_conf_mat(model, X, y):
    conf_mat = confusion_matrix(model.predict(X), y)
    conf_mat = pd.DataFrame(conf_mat/conf_mat.sum())
    conf_mat.columns = ['0_pred', '1_pred']
    conf_mat.index=['0_actual', '1_actual']
    return conf_mat



In [114]:

    
conf_mat = generate_conf_mat(lm, X, y)



In [115]:

    
conf_mat









    Out[115]:







  
    
      
      0_pred
      1_pred
    
  
  
    
      0_actual
      0.88
      0.08
    
    
      1_actual
      0.02
      0.02

Use statsmodels



In [116]:

    
lm2 = Logit(y, X)



In [117]:

    
results = lm2.fit()









    



Optimization terminated successfully.
         Current function value: 0.421470
         Iterations 7



In [118]:

    
results.summary()









    Out[118]:





Logit Regression Results

  Dep. Variable:      HIGH_COST       No. Observations:      4700 


  Model:                Logit         Df Residuals:          4687 


  Method:                MLE          Df Model:                12 


  Date:           Thu, 21 Jun 2018    Pseudo R-squ.:       -0.2946


  Time:               11:27:17        Log-Likelihood:      -1980.9


  converged:            True          LL-Null:             -1530.1


                                    LLR p-value:          1.000 




                 coef      std err       z       P>|z|   [0.025     0.975]  


  SP_ALZHDMTA     -0.0405      0.101     -0.401   0.688     -0.238      0.157


  SP_CHF          -0.2308      0.097     -2.370   0.018     -0.422     -0.040


  SP_CHRNKIDN      1.4570      0.121     12.066   0.000      1.220      1.694


  SP_CNCR          0.1308      0.148      0.882   0.378     -0.160      0.421


  SP_COPD          0.9319      0.116      8.040   0.000      0.705      1.159


  SP_DEPRESSN     -0.3392      0.098     -3.471   0.001     -0.531     -0.148


  SP_DIABETES     -0.8365      0.098     -8.507   0.000     -1.029     -0.644


  SP_ISCHMCHT     -1.1603      0.093    -12.432   0.000     -1.343     -0.977


  SP_OSTEOPRS     -0.2756      0.105     -2.635   0.008     -0.481     -0.071


  SP_RA_OA        -0.0401      0.112     -0.357   0.721     -0.260      0.180


  SP_STRKETIA      1.0162      0.166      6.119   0.000      0.691      1.342


  gender_2        -1.7737      0.075    -23.672   0.000     -1.921     -1.627


  ESRD_Y           0.4462      0.137      3.265   0.001      0.178      0.714

Approach using svm



In [119]:

    
from sklearn.svm import LinearSVC



In [120]:

    
mod = svm.LinearSVC(class_weight='balanced')



In [121]:

    
mod.fit(X, y)









    Out[121]:





LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)



In [122]:

    
conf_mat = generate_conf_mat(mod, X, y)



In [123]:

    
conf_mat









    Out[123]:







  
    
      
      0_pred
      1_pred
    
  
  
    
      0_actual
      0.73
      0.02
    
    
      1_actual
      0.17
      0.08



In [ ]:



In [ ]:

Dep. Variable:	HIGH_COST	No. Observations:	4700
Model:	Logit	Df Residuals:	4687
Method:	MLE	Df Model:	12
Date:	Thu, 21 Jun 2018	Pseudo R-squ.:	-0.2946
Time:	11:27:17	Log-Likelihood:	-1980.9
converged:	True	LL-Null:	-1530.1
		LLR p-value:	1.000

	coef	std err	z	P>\|z\|	[0.025	0.975]
SP_ALZHDMTA	-0.0405	0.101	-0.401	0.688	-0.238	0.157
SP_CHF	-0.2308	0.097	-2.370	0.018	-0.422	-0.040
SP_CHRNKIDN	1.4570	0.121	12.066	0.000	1.220	1.694
SP_CNCR	0.1308	0.148	0.882	0.378	-0.160	0.421
SP_COPD	0.9319	0.116	8.040	0.000	0.705	1.159
SP_DEPRESSN	-0.3392	0.098	-3.471	0.001	-0.531	-0.148
SP_DIABETES	-0.8365	0.098	-8.507	0.000	-1.029	-0.644
SP_ISCHMCHT	-1.1603	0.093	-12.432	0.000	-1.343	-0.977
SP_OSTEOPRS	-0.2756	0.105	-2.635	0.008	-0.481	-0.071
SP_RA_OA	-0.0401	0.112	-0.357	0.721	-0.260	0.180
SP_STRKETIA	1.0162	0.166	6.119	0.000	0.691	1.342
gender_2	-1.7737	0.075	-23.672	0.000	-1.921	-1.627
ESRD_Y	0.4462	0.137	3.265	0.001	0.178	0.714